##  merge.texts.pl
##
##  This takes the positive cases (FPOS) output of merge.results.pl as well as the original file
##  listing class.filter.files used in MID.NLP.newscases.pl, and writes the original stories to
##  a file; a tab-delimited summary is also written. Stories are located using the Factiva unique
##  story identifier.
##
##  TO RUN PROGRAM:
##
##  perl merge.texts.pl prefix
##
##  where prefix is the prefix for the file "$ARGV[0].positive.txt"
##
##  INPUT FILE:
##
##  $file_list : This file contains the names of all of the files that will be searched -- this
##               is assumed to have been originally produced by a ls > class.filter.files and
##               then edited by MID.CD.newcases.pl to produce a clean list of text files
##               (though if the list isn't clean, the results should be mostly harmless)
##
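##  "$ARGV[0].positive.txt": the output of merge.results.pl
##
##  $code_file : Tab-delimited code conversion table with one header line; the second column
##               holds the COW code and the third column the corresponding ISO code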
##
##  OUTPUT FILES:
##
##  ">texts.$ARGV[0]": The original texts with identification headers
##
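##  ">summary.$ARGV[0]": Tab-delimited COW dyad codes, date, story ID, headline, the third
##                       field of the input record, and score
##
##  ">badate.$ARGV[0]": Tab-delimited ISO dyad codes, story ID, and headline for records in
##                      which no 2000-2019 date was found; the story text of such records is
##                      not copied to the texts file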
##
##  PROGRAMMING NOTES:
##
##  1. The date-validity check in the main loop (made just before the summary line is written
##     to FSUM) recognizes only dates in the interval 2000-2019. Also note that the check is
##     fairly simple and would still permit, e.g. 20051499
##
##  SYSTEM REQUIREMENTS
##  This program has been successfully run under Mac OS 10.5; it is standard perl
##  so it should also run in Unix or Windows.
##
##  PROVENANCE:
##  Programmer: Philip A. Schrodt
##              Dept of Political Science
##              Pennsylvania State University
##              227 Pond Laboratory
##              University Park, PA, 16802 U.S.A.
##              http://eventdata.psu.edu
##
##  Copyright (c) 2009  Philip A. Schrodt.  All rights reserved.
##
##   Redistribution and use in source and binary forms, with or without modification,
##   are permitted under the terms of the GNU General Public License:
##   http://www.opensource.org/licenses/gpl-license.html
##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set
##  2002-2010."
##
##  Report bugs to: schrodt@psu.edu
##
##  For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  26-July-08:  Initial version
##  16-Dec-09 :  Adds COW codes and scores
##
##  ----------------------------------------------------------------------------------

#!/usr/local/bin/perl
# ======== globals =========== #

$file_list   = "class.filter.files";  # list of story text files to search, one name per line
$code_file   = "MID.ISO.CODES.txt";   # tab-delimited code conversion table

# ======== main program =========== #

# The prefix argument is mandatory, and prefixes shorter than four characters are rejected.
if (!defined($ARGV[0]) || length($ARGV[0]) < 4) {
  print "File name is required to run the program\n";
  exit(1);  # non-zero status so that a calling script can detect the failure
}

# read COW codes
# Builds %COWCodes from $code_file: the key is the third tab-delimited column and the value
# is the second column. The main loop uses this hash to convert ISO codes to COW codes.
%COWCodes = ();  # initialize hash
open(DAT, '<', $code_file) or die("Could not open code conversion  file $code_file; error $!");
$word = <DAT>; # skip header
while ($word = <DAT>) {
  $word =~ s/[\r\n]+$//;  # strip the line ending (LF or CRLF) so it cannot end up in the key
  @field = split("\t",$word);
  # Skip blank or short lines: they would otherwise create an empty-string key, which would
  # make a record with a missing ISO code look like a valid state.
  if (!defined($field[2]) || $field[2] eq "") {next;}
  $COWCodes{$field[2]} = $field[1];
}
close(DAT);
#while ( ($key, $value) = each(%COWCodes) ) {print "$key => $value\n";}
#exit;

# Open the two inputs: the list of story files and the list of positive cases.
# Three-argument open is used throughout so that characters in the prefix or in a listed
# file name can never be interpreted as an open mode.
open(FDAT, '<', $file_list)  or die "Can\'t open file list $file_list ; error $!";
$input_name = "$ARGV[0].positive.txt";
open(FPOS, '<', $input_name)  or die "Can\'t open list of positives $input_name; error $!";

# Open the three outputs: story texts, tab-delimited summary, and records without a valid date.
$output_name = "texts.$ARGV[0]";
open(FOUT, '>', $output_name)  or die "Can\'t open FOUT output file; error $!";
$summary_name = "summary.$ARGV[0]";
open(FSUM, '>', $summary_name)  or die "Can\'t open FSUM output file; error $!";
$badate_name = "badate.$ARGV[0]";
open(FBAD, '>', $badate_name)  or die "Can\'t open FBAD output file; error $!";

# Start the search in the first file named in the list.
defined($fname = <FDAT>)  or die "File list $file_list is empty\n";
chomp($fname);
# two-argument open ignored surrounding whitespace in the name; keep that tolerance
$fname =~ s/^\s+//;
$fname =~ s/\s+$//;
open(FIN, '<', $fname)  or die "Can\'t open FDAT input file $fname; error $!";

# Process each positive case in turn: convert its ISO codes to COW codes, locate the story in
# the text files by its ID, then write a summary line to FSUM and copy the story to FOUT.
# The text files are only ever read forward (FIN is never rewound and FDAT is read once), so
# the records in FPOS must be in the same order as the stories in the files listed in FDAT.
while ($line = <FPOS>) {
  if ($line =~ m/Positive cases:/) {last;}  # this line ends the list of records

  # the score precedes the "#"; the tab-delimited record starts two characters after it
  $info = substr($line, index($line,"#")+2);
  $score = substr($line, 0, index($line,"#")-5);  # -5 skips some of the decimal places in the score
  @fields = split(/\t/,$info);
  $head = $fields[1];
  $id = $fields[0];
  # a record without a story ID cannot be located in the text files
  if (!defined($id) || $id eq "") {next;}
  # convert ISO to COW codes; skip records containing non-COW states
  if (exists($COWCodes{$fields[3]})) { $src = $COWCodes{$fields[3]}; }
  else {next;}
  chomp($fields[4]);  # the target code is the last field, so it carries the line ending
  if (exists($COWCodes{$fields[4]})) { $tar = $COWCodes{$fields[4]}; }
  else {next;}

  print "\'$id\'\n";
  # Scan forward for the line containing the story ID, moving on to the next file in the
  # list whenever the current one is exhausted. The ID is matched literally (\Q...\E) so
  # that any regex metacharacters in it are not interpreted.
  $news = <FIN>;
  while (!defined($news) || $news !~ m/\Q$id\E/) {
    if (!defined($news = <FIN>)) {
      close(FIN);
      if (defined($fname = <FDAT>)) {
        chomp($fname);
        # two-argument open ignored surrounding whitespace in the name; keep that tolerance
        $fname =~ s/^\s+//;
        $fname =~ s/\s+$//;
#        if ($fname !~ m/^Reut/) {next;}
        open(FIN, '<', $fname)  or die "Can\'t open text input file $fname; error $!";
        print "Searching $fname\n";
      }
      else {
        print "Error: Could not find record $id : \"$head\"\n";
        exit(1);  # non-zero status: the run did not complete
      }
    }
  }
  print FOUT "\n$id\n";
  print FOUT "$head  $src-$tar\n";
  if (($info =~ m/(200\d\d\d\d\d)/) || ($info =~ m/(201\d\d\d\d\d)/))  {  # check that date is [vaguely] valid for 2000-2019
    print FSUM "$src\t$tar\t$1\t$fields[0]\t$fields[1]\t$fields[2]\t$score\n";

    # read the story header up to the "(c)" line, copying lines that look like a date
    while  ($news = <FIN>) {  # get date
      if ($news =~ m/\(c\)/) {last;}
      if (eof(FIN)) { print "Error: Unexpected eof in search for end of header while reading $fname\n"; exit(1);}
      if ($news =~ m/\d, /) {
        print FOUT $news,"\n";
  #      chomp($news);
  #      print FSUM $news,"\t",$info;
      }
    }

    while (length($news)>2) { # skip to a blank line
      $news = <FIN>;
      if (eof(FIN)) { print "Error: Unexpected eof in search for start of text while reading $fname\n"; exit(1);}
    }

    # copy lines up to and including the dashed separator that ends the story
    until ($news =~ m/--------------------------------/) {  # print the whole story
      $news = <FIN>;
      print FOUT $news;
      if (eof(FIN)) { print "Error: Unexpected eof in search for end of story while reading $fname\n"; last;}
    }

  }
  else {   # then skip write to FOUT if we don't have a good date...
    print FBAD "$fields[3]\t$fields[4]\t$fields[0]\t$fields[1]\n";
  }
}

# Close every handle and check the result: errors on buffered writes may only surface at
# close time, so an unchecked (or missing) close can hide a truncated output file.
close(FPOS) or die "Can\'t close FPOS input file ; error $!";
close(FSUM) or die "Can\'t close FSUM output file ; error $!";
close(FOUT) or die "Can\'t close FOUT output file ; error $!";
close(FBAD) or die "Can\'t close FBAD output file ; error $!";
close(FDAT) or die "Can\'t close FDAT input file ; error $!";
close(FIN) or die "Can\'t close FIN input file ; error $!";
print "Program has finished!\n";
